diff --git a/swh/web/misc/coverage.py b/swh/web/misc/coverage.py
index 97cb38e1..7682ede3 100644
--- a/swh/web/misc/coverage.py
+++ b/swh/web/misc/coverage.py
@@ -1,394 +1,391 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import Counter, defaultdict
from functools import lru_cache
from typing import Dict, List, Tuple
from urllib.parse import urlparse
import sentry_sdk
from django.conf.urls import url
from django.shortcuts import render
from django.views.decorators.clickjacking import xframe_options_exempt
from swh.scheduler.model import SchedulerMetrics
from swh.web.common import archive
from swh.web.common.utils import get_deposits_list, reverse
from swh.web.config import scheduler
_swh_arch_overview_doc = (
"https://docs.softwareheritage.org/devel/architecture/overview.html"
)
# Current coverage of the archive, presented as a high-level overview and
# categorized as follows:
#   - listed origins: origins discovered using a swh lister
#   - legacy: origins whose public hosting service has closed
#   - deposited: origins coming from swh-deposit
#
# TODO: Store that list in a database table somewhere (swh-scheduler, swh-storage ?)
#       and retrieve it dynamically
# Origins discovered through listers.
#
# Each entry of "origins" describes one kind of origin:
#   - "type": identifier of the kind of origin; the coverage view looks it up
#     in the listers metrics mapping
#   - "info_url" / "info": link and short description of the origin provider
#   - "search_pattern": query used when building archive search links
#   - "visit_types" (optional): visit types associated to the origins
#   - "logo" (optional): path of a logo image
# The coverage view adds "instances", "count" and "visit_types" keys to these
# entries in place when the page is rendered.
listed_origins = {
    "info": (
        "These software origins get continuously discovered and archived using "
        "the listers implemented by Software Heritage."
    ),
    "origins": [
        {
            "type": "bitbucket",
            "info_url": "https://bitbucket.org",
            "info": "public repositories from Bitbucket",
            "search_pattern": "https://bitbucket.org/",
        },
        {
            "type": "cgit",
            "info_url": "https://git.zx2c4.com/cgit/about",
            "info": "public repositories from cgit instances",
            "search_pattern": "cgit",
        },
        {
            "type": "CRAN",
            "info_url": "https://cran.r-project.org",
            "info": "source packages from The Comprehensive R Archive Network",
            "search_pattern": "https://cran.r-project.org/",
        },
        {
            "type": "debian",
            "info_url": "https://www.debian.org",
            "info": "source packages from the Debian distribution",
            "search_pattern": "deb://",
        },
        {
            "type": "gitea",
            "info_url": "https://gitea.io",
            "info": "public repositories from Gitea instances",
            "search_pattern": "gitea",
        },
        {
            "type": "github",
            "info_url": "https://github.com",
            "info": "public repositories from GitHub",
            "search_pattern": "https://github.com/",
        },
        {
            "type": "gitlab",
            "info_url": "https://gitlab.com",
            "info": "public repositories from multiple GitLab instances",
            "search_pattern": "gitlab",
        },
        {
            "type": "guix",
            "info_url": "https://guix.gnu.org",
            "info": "source code tarballs used to build the Guix package collection",
            "visit_types": ["nixguix"],
            "search_pattern": "https://guix.gnu.org/sources.json",
        },
        {
            "type": "GNU",
            "info_url": "https://www.gnu.org",
            "info": "releases from the GNU project (as of August 2015)",
            "search_pattern": "gnu",
        },
        {
            "type": "heptapod",
            "info_url": "https://heptapod.net/",
            "info": "public repositories from multiple Heptapod instances",
            "search_pattern": "heptapod",
        },
        {
            "type": "launchpad",
            "info_url": "https://launchpad.net",
            "logo": "img/logos/launchpad.png",
            "info": "public repositories from Launchpad",
            "search_pattern": "https://git.launchpad.net/",
        },
        {
            "type": "nixos",
            "info_url": "https://nixos.org",
            "info": "source code tarballs used to build the Nix package collection",
            "visit_types": ["nixguix"],
            "search_pattern": (
                "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
            ),
        },
        {
            "type": "npm",
            "info_url": "https://www.npmjs.com",
            "info": "public packages from the package registry for javascript",
            "search_pattern": "https://www.npmjs.com",
        },
        {
            "type": "opam",
            "info_url": "https://opam.ocaml.org/",
            "info": "public packages from the source-based package manager for OCaml",
            "search_pattern": "opam+https://opam.ocaml.org/",
        },
        # Apart from our own forge, most phabricator origins have been listed
        # but not archived yet, so do not display that type of origins until
        # new listing processes have been executed and the origins loaded.
        #
        # {
        #     "type": "phabricator",
        #     "info_url": "https://www.phacility.com/phabricator",
        #     "info": "public repositories from multiple Phabricator instances",
        #     "search_pattern": "phabricator",
        # },
        {
            "type": "pypi",
            "info_url": "https://pypi.org",
            "info": "source packages from the Python Package Index",
            "search_pattern": "https://pypi.org",
        },
        {
            "type": "sourceforge",
            "info_url": "https://sourceforge.net",
            "info": "public repositories from SourceForge",
            "search_pattern": "code.sf.net",
        },
    ],
}
# Origins coming from discontinued hosting services.
#
# Entries carry a hard-coded, pre-formatted "count" string and a list of
# "visit_types"; the coverage view builds one search URL per visit type from
# "search_pattern" and stores them under a "search_urls" key.
legacy_origins = {
    "info": (
        "Discontinued hosting services. "
        "Those origins have been archived by Software Heritage."
    ),
    "origins": [
        {
            "type": "gitorious",
            "info_url": "https://en.wikipedia.org/wiki/Gitorious",
            "info": (
                "public repositories from the former Gitorious "
                "code hosting service"
            ),
            "visit_types": ["git"],
            "search_pattern": "https://gitorious.org",
            "count": "122,014",
        },
        {
            "type": "googlecode",
            "info_url": "https://code.google.com/archive",
            "info": (
                "public repositories from the former Google Code "
                "project hosting service"
            ),
            "visit_types": ["git", "hg", "svn"],
            "search_pattern": "googlecode.com",
            "count": "790,026",
        },
        {
            "type": "bitbucket",
            "info_url": "https://bitbucket.org",
            "info": "public repositories from Bitbucket",
            "search_pattern": "https://bitbucket.org/",
            "visit_types": ["hg"],
            "count": "336,795",
        },
    ],
}
# Origins pushed into the archive through the deposit service.
#
# "search_pattern" doubles as the key looked up in the per-network-location
# deposit counts computed by the coverage view, which then adds "count" (when
# available) and "search_urls" keys to these entries in place.
deposited_origins = {
    "info": (
        "These origins are directly pushed into the archive by trusted partners "
        "using the deposit service of Software Heritage."
    ),
    "origins": [
        {
            "type": "elife",
            "info_url": "https://elifesciences.org",
            "info": (
                "research software source code associated to the articles "
                "eLife publishes"
            ),
            "search_pattern": "elife.stencila.io",
            "visit_types": ["deposit"],
        },
        {
            "type": "hal",
            "info_url": "https://hal.archives-ouvertes.fr",
            "info": "scientific software source code deposited in the open archive HAL",
            "visit_types": ["deposit"],
            "search_pattern": "hal.archives-ouvertes.fr",
        },
        {
            "type": "ipol",
            "info_url": "https://www.ipol.im",
            "info": "software artifacts associated to the articles IPOL publishes",
            "visit_types": ["deposit"],
            "search_pattern": "doi.org/10.5201",
        },
    ],
}
@lru_cache()
def _get_listers_metrics() -> Dict[str, List[Tuple[str, SchedulerMetrics]]]:
    """Returns scheduler metrics in the following mapping:
        Dict[lister_name, List[Tuple[instance_name, SchedulerMetrics]]]
    as a lister instance has one SchedulerMetrics object per visit type.

    Only listers having at least one metrics object get a key in the mapping.

    Any exception raised while querying the scheduler is reported to Sentry
    and whatever was collected so far (possibly nothing) is returned.

    NOTE(review): the function takes no argument, so ``lru_cache`` memoizes
    the first result it returns, including a result built after a failure.
    The returned mapping is a ``defaultdict`` shared between callers: looking
    up a missing key with ``[]`` would insert it into the cached object.
    """
    listers_metrics = defaultdict(list)
    try:
        listers = scheduler().get_listers()
        # index the metrics by lister id in a single pass instead of
        # rescanning the whole metrics list for every lister
        metrics_by_lister_id = defaultdict(list)
        for metrics in scheduler().get_metrics():
            metrics_by_lister_id[metrics.lister_id].append(metrics)
        for lister in listers:
            # .get() avoids creating index entries for listers without metrics
            for metrics in metrics_by_lister_id.get(lister.id, ()):
                listers_metrics[lister.name].append((lister.instance_name, metrics))
    except Exception as e:
        sentry_sdk.capture_exception(e)
    return listers_metrics
@lru_cache()
def _get_deposits_netloc_counts() -> Counter:
    """Return deposit counts per origin url network location.

    Only deposits whose status is "done" are counted. For origins hosted on
    doi.org, the first path segment is appended to the network location
    (e.g. "doi.org/10.5201").

    Any exception raised while fetching or processing the deposits is
    reported to Sentry and an empty Counter is returned.

    NOTE(review): the function takes no argument, so ``lru_cache`` memoizes
    the first result it returns, including the empty one returned after a
    failure.
    """

    def _process_origin_url(origin_url):
        parsed_url = urlparse(origin_url)
        netloc = parsed_url.netloc
        # special treatment for doi.org netloc as it is not specific enough
        # for origins mapping
        if netloc == "doi.org":
            path_parts = parsed_url.path.split("/")
            # A doi.org URL with an empty path has no segment to append.
            # Indexing blindly would raise IndexError, which the broad except
            # below would turn into the loss of every deposit count.
            if len(path_parts) > 1:
                netloc += "/" + path_parts[1]
        return netloc

    netlocs = []
    try:
        deposits = get_deposits_list()
        netlocs = [
            _process_origin_url(d["origin_url"])
            for d in deposits
            if d["status"] == "done"
        ]
    except Exception as e:
        sentry_sdk.capture_exception(e)
    return Counter(netlocs)
@lru_cache()
def _get_nixguix_origins_count(origin_url: str) -> int:
    """Returns the number of archived tarballs for a nixguix origin (NixOS
    or Guix), aka the number of branches in a dedicated origin in the archive.

    The value is the "revision" entry of the sizes of the latest snapshot of
    the origin, or 0 when no snapshot is found. Results are memoized per
    origin url by ``lru_cache``.
    """
    latest_snapshot = archive.lookup_latest_origin_snapshot(origin_url)
    if not latest_snapshot:
        return 0
    sizes = archive.lookup_snapshot_sizes(latest_snapshot["id"])
    return sizes["revision"]
def _search_url(query: str, visit_type: str) -> str:
    """Return the URL of the "browse-search" view for the given query and
    visit type, with the "with_visit" and "with_content" parameters set to
    "true".
    """
    search_params = {
        "q": query,
        "visit_type": visit_type,
        "with_visit": "true",
        "with_content": "true",
    }
    return reverse("browse-search", query_params=search_params)
@xframe_options_exempt
def _swh_coverage(request):
    """Render the archive coverage page ("misc/coverage.html").

    The module-level ``listed_origins``, ``legacy_origins`` and
    ``deposited_origins`` mappings are updated in place with counts,
    per-instance details and search URLs before being handed to the template.

    Counts are stored as strings formatted with thousands separators. A zero
    count is stored as an empty string because the template only displays a
    row when its count is truthy, and the string "0" would be.

    Args:
        request: the request, forwarded to ``render``

    Returns:
        the response produced by ``render``
    """
    listers_metrics = _get_listers_metrics()
    for origins in listed_origins["origins"]:
        origins["instances"] = {}
        origins_type = origins["type"]

        # special processing for nixos/guix origins as there is no
        # scheduler metrics for those
        if origins_type in ("nixos", "guix"):
            count = _get_nixguix_origins_count(origins["search_pattern"])
            origins["count"] = f"{count:,}" if count else ""
            # reuse the formatted value so that the per-instance count is
            # displayed like every other count of the page
            origins["instances"][origins_type] = {
                "nixguix": {"count": origins["count"]}
            }

        if origins_type not in listers_metrics:
            continue

        type_metrics = listers_metrics[origins_type]
        count_total = sum(metrics.origins_known for _, metrics in type_metrics)
        count_never_visited = sum(
            metrics.origins_never_visited for _, metrics in type_metrics
        )
        count = count_total - count_never_visited
        origins["count"] = f"{count:,}"

        origins["instances"] = defaultdict(dict)
        for instance, metrics in type_metrics:
            # not yet in production
            if metrics.visit_type in ("bzr", "cvs"):
                continue
            instance_count = metrics.origins_known - metrics.origins_never_visited
            origins["instances"][instance].update(
                {
                    metrics.visit_type: {
                        # empty string rather than "0": see docstring
                        "count": f"{instance_count:,}" if instance_count else ""
                    }
                }
            )
            origins["visit_types"] = list(
                set(origins["instances"][instance].keys())
                | set(origins.get("visit_types", []))
            )

        if origins_type == "CRAN":
            origins["instances"]["cran"]["cran"] = {"count": origins["count"]}

        # defaultdict cannot be iterated in django template
        origins["instances"] = dict(origins["instances"])

    for origins in listed_origins["origins"]:
        instances = origins["instances"]
        nb_instances = len(instances)
        for instance_name, visit_types in instances.items():
            for visit_type in visit_types:
                # with several instances, search on the instance name,
                # otherwise on the search pattern of the origins type
                if nb_instances > 1:
                    search_pattern = instance_name
                else:
                    search_pattern = origins["search_pattern"]
                search_url = _search_url(search_pattern, visit_type)
                visit_types[visit_type]["search_url"] = search_url

    for origins in legacy_origins["origins"]:
        origins["search_urls"] = {}
        for visit_type in origins["visit_types"]:
            origins["search_urls"][visit_type] = _search_url(
                origins["search_pattern"], visit_type
            )

    deposits_counts = _get_deposits_netloc_counts()
    for origins in deposited_origins["origins"]:
        if origins["search_pattern"] in deposits_counts:
            origins["count"] = f"{deposits_counts[origins['search_pattern']]:,}"
        origins["search_urls"] = {
            "deposit": _search_url(origins["search_pattern"], "deposit")
        }

    return render(
        request,
        "misc/coverage.html",
        {
            "origins": {
                "Regular crawling": listed_origins,
                "Discontinued hosting": legacy_origins,
                "On demand archival": deposited_origins,
            }
        },
    )
# Route the "coverage/" path to the coverage view under the name "swh-coverage".
# NOTE(review): django.conf.urls.url() is deprecated in recent Django releases
# in favor of django.urls.re_path() - confirm against the Django version in use.
urlpatterns = [url(r"^coverage/$", _swh_coverage, name="swh-coverage")]
diff --git a/swh/web/templates/misc/coverage.html b/swh/web/templates/misc/coverage.html
index 5a6c9658..6121eaa5 100644
--- a/swh/web/templates/misc/coverage.html
+++ b/swh/web/templates/misc/coverage.html
@@ -1,151 +1,151 @@
{% comment %}
Copyright (C) 2015-2021 The Software Heritage developers
See the AUTHORS file at the top-level directory of this distribution
License: GNU Affero General Public License version 3, or any later version
See top-level LICENSE file for more information
{% endcomment %}
{% load js_reverse %}
{% load static %}
{% load render_bundle from webpack_loader %}
Software Heritage archive coverage
{% render_bundle 'vendors' %}
{% render_bundle 'webapp' %}
A significant amount of source code has already been ingested in the Software Heritage
archive. It notably includes the following software origins.
{% for origins_type, origins_data in origins.items %}
{{ origins_type }}
{{ origins_data.info | safe }}
{% for origins in origins_data.origins %}
{% with 'img/logos/'|add:origins.type.lower|add:'.png' as png_logo %}
{% endwith %}
{% if "instances" in origins %}
instance |
type |
count |
search |
{% for instance, visit_types in origins.instances.items %}
{% for visit_type, data in visit_types.items %}
{% if data.count %}
{{ instance }} |
{{ visit_type }} |
{{ data.count }} |
|
{% endif %}
{% endfor %}
{% endfor %}
{% else %}
instance |
type |
search |
{% for visit_type, search_url in origins.search_urls.items %}
{{ origins.type }} |
{{ visit_type }} |
|
{% endfor %}
{% endif %}
{% endfor %}
{% endfor %}
JavaScript license information